# Libraries
import pandas as pd

# Path
path="E:/articulos/argentina-ghg/"

# Read the "Extracción" sheet from the specified Excel file into a DataFrame
df = pd.read_excel(path+'/input/C10.2 - Medio Ambiente - Valores.xlsx',sheet_name="Extracción",header=2)

df.head

<bound method NDFrame.head of     Unnamed: 0         Unnamed: 1 Unnamed: 2  Unnamed: 3  Unnamed: 4  \
0          NaN              Total    Nativas  Cultivadas         NaN   
1          NaN                NaN        NaN         NaN         NaN   
2          NaN  En metros cúbicos        NaN         NaN         NaN   
3       1986.0                  …          …           …         NaN   
4       1987.0                  …          …           …         NaN   
5       1988.0                  …          …           …         NaN   
6       1989.0                  …          …           …         NaN   
7       1990.0          3097050.0  2478103.0    618947.0         NaN   
8       1991.0          3240051.0  2675047.0    565004.0         NaN   
9       1992.0          3908746.0  3347834.0    560912.0         NaN   
10      1993.0          3498502.0  3054716.0    443786.0         NaN   
11      1994.0          4448173.0  2993473.0   1454700.0         NaN   
12      1995.0          3571250.0  2317731.0   1253519.0         NaN   
13      1996.0          3874679.0  2768557.0   1106122.0         NaN   
14      1997.0          3977727.0  2810697.0   1167030.0         NaN   
15      1998.0          5176119.0  3949294.0   1226825.0         NaN   
16      1999.0          3585913.0  3574234.0     11679.0         NaN   
17      2000.0          2993381.0  2941514.0     51867.0         NaN   
18      2001.0          3295933.0  3215059.0     80874.0         NaN   
19      2002.0          2798112.0  2753085.0     45027.0         NaN   
20      2003.0          3589467.0  3518700.0     70767.0         NaN   
21      2004.0          5594013.0  5511633.0     82380.0         NaN   
22      2005.0          4380125.0  4317141.0     62984.0         NaN   
23      2006.0          4383078.0  4261817.0    121261.0         NaN   
24      2007.0          4293650.0  4199885.0     93765.0         NaN   
25      2008.0          4362429.0  4247329.0    115100.0         NaN   
26      2009.0          4267153.0  4178953.0     88200.0         NaN   
27      2010.0          4374522.0  4298852.0     75670.0         NaN   
28      2011.0          4547140.0  4490604.0     56536.0         NaN   
29      2012.0          3992571.0  3917673.0     74898.0         NaN   
30      2013.0          4620379.0  4496117.0    124262.0         NaN   
31      2014.0          4465917.0  4023265.0    442652.0         NaN   
32      2015.0                  …          …           …         NaN   

    Unnamed: 5      Unnamed: 6      Unnamed: 7  
0        Total         Nativas      Cultivadas  
1          NaN             NaN             NaN  
2          NaN             NaN             NaN  
3            …               …  4203342.649498  
4            …               …  4595339.899083  
5            …               …  5035727.256989  
6            …               …  5248057.424235  
7    7197209.0   1998336.76955   5198872.23045  
8    6807986.0  1086776.497526  5721209.502474  
9    7487950.0  1400611.581165  6087338.418835  
10   6989692.0  2172706.411691  4816985.588309  
11   6524538.0       1103694.0       5420844.0  
12   7907132.0       1186640.0       6720492.0  
13   7131330.0       1103672.0       6027658.0  
14   6533743.0        970156.0       5563587.0  
15   7387801.0       1132413.0       6255388.0  
16   6660920.0        873745.0       5787175.0  
17   8402461.0        769238.0       7633223.0  
18   6992278.0        629703.0       6362575.0  
19   9876226.0        798957.0       9077269.0  
20  10650807.0        904489.0       9746318.0  
21  10606874.0       1063733.0       9543141.0  
22  11227125.0       1189144.0      10037981.0  
23  10644142.0       1075712.0       9568430.0  
24  10516799.0        871214.0       9645585.0  
25   9838734.0        789066.0       9049668.0  
26  11046078.0        582782.0      10463296.0  
27  11894019.0        528306.0      11365713.0  
28  11944977.0        540499.0      11404478.0  
29  14429178.0        543629.0      13885549.0  
30  15411751.0        474779.0      14936972.0  
31  12266941.0        527316.0      11739625.0  
32           …               …               …  >


# Remove unnecessary rows and columns, and reset the index
cleaned_data = df.iloc[2:].dropna(how='all').reset_index(drop=True)

# Rename columns for easier reference
column_names = ["Year", "Firewood_Total", "Firewood_Native", "Firewood_Cultivated", "Logs_Total", "Logs_Native", "Logs_Cultivated", "Extra"]
cleaned_data.columns = column_names

# Drop the "Extra" column that was included due to additional NaN values
cleaned_data.drop(columns=['Extra'], inplace=True)

# Keep only relevant columns related to "Firewood"
relevant_columns = ["Year", "Firewood_Total", "Firewood_Native", "Firewood_Cultivated"]
cleaned_data = cleaned_data[relevant_columns]

# Remove rows with non-numeric "Year" or "Firewood_Total"
cleaned_data = cleaned_data[pd.to_numeric(cleaned_data['Year'], errors='coerce').notnull()]
cleaned_data = cleaned_data[cleaned_data['Firewood_Total'] != '…']

# Reset the index after removing rows
cleaned_data.reset_index(drop=True, inplace=True)

# Display the cleaned data
print(cleaned_data.head())

# Convert "Year" to integer
cleaned_data['Year'] = cleaned_data['Year'].astype(int)

# Convert the other columns to string first, then replace the dots and convert to integers
for col in ["Firewood_Total", "Firewood_Native", "Firewood_Cultivated"]:
    cleaned_data[col] = cleaned_data[col].astype(str).str.replace('.', '', regex=False).astype(int)

# Verify the data types and first few rows of the cleaned data
cleaned_data.dtypes, cleaned_data.head()

     Year Firewood_Total Firewood_Native Firewood_Cultivated
0  1990.0      3097050.0       2478103.0            618947.0
1  1991.0      3240051.0       2675047.0            565004.0
2  1992.0      3908746.0       3347834.0            560912.0
3  1993.0      3498502.0       3054716.0            443786.0
4  1994.0      4448173.0       2993473.0           1454700.0

(Year                   int32
 Firewood_Total         int32
 Firewood_Native        int32
 Firewood_Cultivated    int32
 dtype: object,
    Year  Firewood_Total  Firewood_Native  Firewood_Cultivated
 0  1990        30970500         24781030              6189470
 1  1991        32400510         26750470              5650040
 2  1992        39087460         33478340              5609120
 3  1993        34985020         30547160              4437860
 4  1994        44481730         29934730             14547000)

Basic cleaning of an excel dataset¶

Juan Andrés Cabral¶